package crawler;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;

import com.gargoylesoftware.htmlunit.BrowserVersion;

import crawler.classes.Article;
import crawler.classes.SinaLogonDog;
import crawler.util.PageUtils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;

import org.openqa.selenium.WebDriver;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import static crawler.util.StringUtils.decodeUnicode;


/**
 * 如果爬虫需要抽取Javascript生成的数据，可以使用HtmlUnitDriver
 * HtmlUnitDriver可以用page.getDriver来生成
 *
 * @author wangyu
 */
/**
 * Crawls Sina Weibo search results for a given keyword and stores the
 * extracted posts via {@code ArticlesService}.
 * <p>
 * The constructor logs in (via {@link SinaLogonDog}), discovers the paginated
 * search-result URLs for the keyword, and registers them as crawl seeds; each
 * visited page is then parsed by {@link #extractArticles(Page)}.
 * <p>
 * When JavaScript-generated content must be extracted, an HtmlUnitDriver can be
 * obtained through {@code page.getDriver} (see {@link #test(String)}).
 *
 * @author wangyu
 */
public class CrawlerWeibo extends DeepCrawler {
    /** Pause between page visits, in milliseconds (throttles the request rate). */
    public int wait_time = 2000;
    /** Source-type tag stored with every article. */
    private String type = "weibo";
    /** Base URL of the Weibo search endpoint. */
    public String root_url = "http://s.weibo.com/weibo/";
    public String key_word;
    public String key_url;
    public String article_url;
    public String dst_regex;
    public ArrayList<String> current_urls;
    /** Login cookies obtained from SinaLogonDog; may be empty if login failed. */
    private Map<String, String> cookies;
    /** Whether previously stored articles should be updated on re-crawl. */
    public boolean isUpdate;

    /**
     * Matches paginated search-result paths such as {@code /weibo/<kw>&page=2}.
     * The pattern is a constant, so it is compiled once here; the former
     * per-call compilation with an empty PatternSyntaxException handler is gone.
     */
    private static final Pattern PAGE_LINK_PATTERN =
            Pattern.compile("/weibo/.+?&page=\\d+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /**
     * @param crawlPath directory used by the underlying DeepCrawler for crawl state
     * @param key_word  search keyword whose results should be collected
     * @param isUpdate  true to update articles that already exist in storage
     */
    public CrawlerWeibo(String crawlPath, String key_word, boolean isUpdate) {
        super(crawlPath);
        ArticlesService.SaveKeyWord(key_word);
        // Initialize the keyword and crawl mode.
        this.key_word = key_word;
        this.isUpdate = isUpdate;
        this.current_urls = new ArrayList<String>();
        // Build the seed search URL. Encode explicitly as UTF-8: the
        // single-argument URLEncoder.encode is deprecated and uses the platform
        // default charset, which corrupts non-ASCII keywords on some systems.
        String src_url;
        try {
            src_url = root_url + URLEncoder.encode(key_word, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is mandated by the JVM spec; this cannot actually happen.
            throw new IllegalStateException("UTF-8 unsupported", e);
        }
        // SECURITY(review): hard-coded account credentials — move to configuration.
        String username = "wangyu.gogogo@163.com";
        String password = "wangyu0502";
        Map<String, String> cookies = new HashMap<String, String>();
        try {
            cookies = new SinaLogonDog().getCookies(username, password);
        } catch (IOException e) {
            // Best effort: continue with empty cookies (the crawl will see less data).
            e.printStackTrace();
        }
        this.cookies = cookies;

        // Seed list starts with the first result page; pagination links are added below.
        ArrayList<String> src_urls = new ArrayList<String>();
        src_urls.add(src_url);
        String src_html = null;
        try {
            src_html = Jsoup.connect(src_url).cookies(cookies).get().html();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Guard against a failed fetch: the original dereferenced src_html
        // unconditionally and threw a NullPointerException when the request failed.
        if (src_html != null) {
            // The page embeds content as \\uXXXX escapes; convert before matching.
            src_html = decodeUnicode(src_html);
            Matcher regexMatcher = PAGE_LINK_PATTERN.matcher(src_html);
            while (regexMatcher.find()) {
                String tmp = "http://s.weibo.com" + regexMatcher.group();
                if (!src_urls.contains(tmp)) {
                    System.out.println("待爬取的页面:" + tmp);
                    src_urls.add(tmp);
                }
            }
        }
        this.setSeeds(src_urls);

        // Propagate the login cookies to the crawler's HTTP requester as a
        // single "k=v;" header string (StringBuilder avoids O(n^2) concatenation).
        HttpRequesterImpl httpRequester1 = (HttpRequesterImpl) this.getHttpRequester();
        StringBuilder cookieHeader = new StringBuilder();
        for (Map.Entry<String, String> cookie : cookies.entrySet()) {
            cookieHeader.append(cookie.getKey()).append('=').append(cookie.getValue()).append(';');
        }
        httpRequester1.setCookie(cookieHeader.toString());
    }

    /**
     * Visits one page, extracts its articles, then throttles by sleeping
     * {@link #wait_time} ms. Always returns null: the pagination seeds were
     * already registered in the constructor, so no new links are followed here.
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        extractArticles(page);
        try {
            System.out.println("暂停时间:" + wait_time / 1000 + " 秒");
            Thread.sleep(this.wait_time);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Parses one search-result page: locates the inline script carrying the
     * "pl_weibo_direct" pagelet JSON, cuts out its embedded HTML fragment,
     * extracts one {@link Article} per post card, fetches each post's full page,
     * and persists everything through {@code ArticlesService.saveMore}.
     */
    public void extractArticles(Page page) {
        System.out.println("页面:" + page.getUrl());
        System.out.println("采集开始");

        Document doc = Parser.parse(decodeUnicode(page.getHtml()), "utf-8");
        Elements elements = doc.select("script");
        // Find the script element whose JSON payload carries the search results.
        String script = null;
        for (Element candidate : elements) {
            if (candidate.toString().startsWith("<script>STK && STK.pageletM && STK.pageletM.view({\"pid\":\"pl_weibo_direct\"")) {
                script = candidate.toString();
            }
        }
        // A CAPTCHA interstitial appears when the crawl rate is too high.
        if (doc.toString().indexOf("我真滴不是机器人") > -1) {
            System.out.println("有验证码限制");
        }
        // BUG FIX: on CAPTCHA pages or after a markup change no script matches
        // and the original code threw a NullPointerException below; bail out.
        if (script == null) {
            System.out.println("字段获取失败");
            return;
        }
        // Cut the "html" field's value out of the pagelet JSON: start+7 skips
        // the `html":"` prefix and stop-1 drops the closing quote before `}`.
        int start_pos = script.indexOf("html");
        int stop_pos = script.lastIndexOf("}");
        String weibo_html = script.substring(start_pos + 7, stop_pos - 1);

        // Parse the extracted fragment and collect one Article per post card.
        Document weibo_doc = Parser.parse(weibo_html, "utf-8");
        List<Article> articles = new ArrayList<Article>();
        Elements weibos = weibo_doc.select(".WB_cardwrap");
        for (Element weibo : weibos) {
            String url = null;
            String title = null;
            String content = null;
            String src_html = null;
            String create_time = null;
            String full_content = "";
            String full_html = "";
            try {
                // Post permalink.
                url = weibo.select(".W_texta").select(".feed_from").select(".W_textb").get(1).attr("href");
                // Author nickname, used as the title.
                title = weibo.select(".W_texta").select(".comment_txt").attr("nick-name");
                // Plain-text body.
                content = weibo.select(".W_texta").select(".comment_txt").text();
                // Raw HTML of the body.
                src_html = weibo.select(".W_texta").select(".comment_txt").html();
                // Publication timestamp (title attribute of the permalink element).
                create_time = weibo.select(".W_texta").select(".feed_from").select(".W_textb").get(1).attr("title");
            } catch (Exception e) {
                // A malformed card skips only this one post.
                e.printStackTrace();
            }
            if (url != null
                    && title != null
                    && content != null
                    && src_html != null
                    && create_time != null) {
                System.out.println(page.getUrl());
                System.out.println(url);
                Document fulldoc = null;
                try {
                    // The "spider" User-Agent bypasses the "Sina Visitor System"
                    // redirect; see http://www.kuqin.com/shuoit/20141017/342699.html
                    fulldoc = Jsoup.connect(url).header("User-agent", "spider").timeout(0).get();
                    full_content = fulldoc.text();
                    full_html = fulldoc.html();
                } catch (IOException e) {
                    e.printStackTrace();
                }

                Article article = new Article(this.type, this.key_word, url, title, content, src_html, create_time, full_content, full_html);
                articles.add(article);
                if (onCrawlerVisitListener != null) {
                    onCrawlerVisitListener.onVisit(url);
                }
                // BUG FIX: the original added articles.size() on every loop pass,
                // over-counting the total as 1 + 2 + ... + n; count each article once.
                this.totalSize++;
            } else {
                System.out.println("字段获取失败");
            }
        }

        int updateSize = ArticlesService.saveMore(articles, isUpdate);
        if (isUpdate) {
            if (onCrawlerVisitListener != null) {
                onCrawlerVisitListener.onUpdate(updateSize);
            }
        }

        System.out.println("文章数:" + articles.size());
        System.out.println("采集完成");
    }

    /**
     * Entry point for a manual run. Configure a real Weibo account in the
     * constructor first, otherwise only the first result page is reachable.
     */
    public static void main(String[] args) throws Exception {
        CrawlerWeibo crawler = new CrawlerWeibo("./tmp", "姚明", false);
        // Keep the thread count low to avoid triggering the CAPTCHA limit.
        crawler.setThreads(4);
        crawler.start(2);
    }

    /**
     * Debug helper: fetches {@code url} two ways (HtmlUnit WebDriver and Jsoup
     * with fresh login cookies) so the renderings can be compared in a debugger.
     */
    public static void test(String url) {
        // Rendering 1: HtmlUnit driver (executes JavaScript).
        WebDriver driver = PageUtils.getDriverByUrl(url, BrowserVersion.CHROME);
        String info = driver.getPageSource();
        String new_info = decodeUnicode(info);
        Map<String, String> cookies = null;
        // Rendering 2: plain Jsoup fetch with login cookies.
        try {
            // SECURITY(review): hard-coded credentials — move to configuration.
            cookies = new SinaLogonDog().getCookies("wangyu.gogogo@163.com", "wangyu0502");
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            String info1 = Jsoup.connect(url).cookies(cookies).get().toString();
            String info2 = decodeUnicode(info1);
            System.out.println(1);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
